/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if defined(__aarch64__)

#include "arm_gemm.hpp"
#include <cstdint>

namespace arm_conv {
namespace depthwise {

void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
{
  __asm__ __volatile__(
    "ldp x13, x12, [%x[inptrs], #0x0]\n"
    "add SP, SP, #-0x80\n"
    "ldp x11, x10, [%x[inptrs], #0x10]\n"
    "mov x19, #0x1\n"
    "ldp x9, x28, [%x[inptrs], #0x20]\n"
    "orr x19, x19, #0x100\n"
    "ldp x27, x26, [%x[inptrs], #0x30]\n"
    "orr x19, x19, #0x10000\n"
    "dup v11.4s, w19\n"
    "ldp x25, x24, [%x[outptrs], #0x0]\n"
    "mov x23, #0x0\n"
    "ldp x22, x21, [%x[outptrs], #0x10]\n"
    "lsr x20, %x[n_channels], #0x4\n"
    "add x19, %x[qp], %[offsetof_Requantize32_minval]\n"
    "ld1r { v9.4s }, [x19]\n"
    "add x19, %x[qp], %[offsetof_Requantize32_maxval]\n"
    "ld1r { v12.4s }, [x19]\n"
    "add x19, %x[qp], %[offsetof_Requantize32_b_offset]\n"
    "ld1r { v14.4s }, [x19]\n"
    "add x19, %x[qp], %[offsetof_Requantize32_c_offset]\n"
    "ld1r { v13.4s }, [x19]\n"
    "cbz x20, 2f\n"
    "1:"  // Loop
    "movi v15.4s, #0x0\n"
    "ldr q27, [x13, x23]\n"
    "subs x20, x20, #0x1\n"
    "movi v10.4s, #0x0\n"
    "ldr q1, [x12, x23]\n"
    "ldp x13, x12, [%x[inptrs], #0x40]\n"
    "ldr q25, [x11, x23]\n"
    "zip1 v7.16b, v27.16b, v25.16b\n"
    "ldr q23, [x10, x23]\n"
    "zip2 v5.16b, v27.16b, v25.16b\n"
    "ldp x11, x10, [%x[inptrs], #0x50]\n"
    "ldr q31, [x9, x23]\n"
    "zip1 v8.16b, v1.16b, v23.16b\n"
    "ldr q28, [x28, x23]\n"
    "zip2 v3.16b, v1.16b, v23.16b\n"
    "ldp x9, x28, [%x[inptrs], #0x60]\n"
    "zip1 v6.16b, v7.16b, v8.16b\n"
    "ldr q21, [x27, x23]\n"
    "zip2 v8.16b, v7.16b, v8.16b\n"
    "ldr q26, [x26, x23]\n"
    "zip1 v7.16b, v5.16b, v3.16b\n"
    "ldp x27, x26, [%x[inptrs], #0x70]\n"
    "zip2 v5.16b, v5.16b, v3.16b\n"
    "ldr q24, [x13, x23]\n"
    "ldr q22, [x12, x23]\n"
    "zip1 v2.16b, v31.16b, v21.16b\n"
    "zip2 v4.16b, v31.16b, v21.16b\n"
    "ldp x13, x12, [%x[inptrs], #0x0]\n"
    "zip1 v1.16b, v28.16b, v26.16b\n"
    "ldr q20, [x11, x23]\n"
    "zip2 v31.16b, v28.16b, v26.16b\n"
    "ldr q16, [x10, x23]\n"
    "zip1 v3.16b, v2.16b, v1.16b\n"
    "ldp x11, x10, [%x[inptrs], #0x10]\n"
    "zip2 v2.16b, v2.16b, v1.16b\n"
    "ldr q19, [x9, x23]\n"
    "zip1 v1.16b, v4.16b, v31.16b\n"
    "ldr q0, [x28, x23]\n"
    "zip1 v28.16b, v24.16b, v20.16b\n"
    "ldp x9, x28, [%x[inptrs], #0x20]\n"
    "zip2 v26.16b, v24.16b, v20.16b\n"
    "ldr q18, [x27, x23]\n"
    "zip1 v24.16b, v22.16b, v16.16b\n"
    "ldr q17, [x26, x23]\n"
    "zip2 v22.16b, v22.16b, v16.16b\n"
    "ldp x27, x26, [%x[inptrs], #0x30]\n"
    "zip2 v16.16b, v4.16b, v31.16b\n"
    "str q7, [SP, #0x0]\n"
    "zip1 v31.16b, v28.16b, v24.16b\n"
    "str q5, [SP, #0x10]\n"
    "zip1 v20.16b, v19.16b, v18.16b\n"
    "str q1, [SP, #0x20]\n"
    "zip2 v19.16b, v19.16b, v18.16b\n"
    "str q16, [SP, #0x30]\n"
    "zip1 v18.16b, v0.16b, v17.16b\n"
    "ldr q30, [%x[params], #0x0]\n"
    "zip2 v17.16b, v0.16b, v17.16b\n"
    "ldr q29, [%x[params], #0x10]\n"
    "zip2 v28.16b, v28.16b, v24.16b\n"
    "ldr q27, [%x[params], #0x20]\n"
    "zip1 v16.16b, v26.16b, v22.16b\n"
    "str q16, [SP, #0x40]\n"
    "zip2 v16.16b, v26.16b, v22.16b\n"
    "str q16, [SP, #0x50]\n"
    "zip1 v26.16b, v20.16b, v18.16b\n"
    "ldr q25, [%x[params], #0x30]\n"
    "zip2 v24.16b, v20.16b, v18.16b\n"
    "ldr q23, [%x[params], #0x40]\n"
    "zip1 v16.16b, v19.16b, v17.16b\n"
    "str q16, [SP, #0x60]\n"
    "zip2 v16.16b, v19.16b, v17.16b\n"
    "str q16, [SP, #0x70]\n"
    "mov v22.16b, v30.16b\n"
    "ldr q21, [%x[params], #0x50]\n"
    "mov v20.16b, v30.16b\n"
    "mov v19.16b, v30.16b\n"
    ".inst 0x4e8697be  // sdot v30.4s, v29.16b, v6.16b\n"
    ".inst 0x4e8397b4  // sdot v20.4s, v29.16b, v3.16b\n"
    ".inst 0x4e83956f  // sdot v15.4s, v11.16b, v3.16b\n"
    ".inst 0x4e83977e  // sdot v30.4s, v27.16b, v3.16b\n"
    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
    ".inst 0x4e9f9774  // sdot v20.4s, v27.16b, v31.16b\n"
    ".inst 0x4e9f956f  // sdot v15.4s, v11.16b, v31.16b\n"
    ".inst 0x4e9f973e  // sdot v30.4s, v25.16b, v31.16b\n"
    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
    ".inst 0x4e9a9734  // sdot v20.4s, v25.16b, v26.16b\n"
    "mov v17.16b, v15.16b\n"
    ".inst 0x4e86956f  // sdot v15.4s, v11.16b, v6.16b\n"
    "mls v30.4s, v15.4s, v14.4s\n"
    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
    "mls v20.4s, v17.4s, v14.4s\n"
    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
    ".inst 0x4e8697b6  // sdot v22.4s, v29.16b, v6.16b\n"
    ".inst 0x4e8397b3  // sdot v19.4s, v29.16b, v3.16b\n"
    "ldr q29, [%x[params], #0x70]\n"
    ".inst 0x4e83956a  // sdot v10.4s, v11.16b, v3.16b\n"
    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
    ".inst 0x4e839776  // sdot v22.4s, v27.16b, v3.16b\n"
    "ldr q3, [SP, #0x20]\n"
    ".inst 0x4e9f9773  // sdot v19.4s, v27.16b, v31.16b\n"
    "ldr q27, [%x[params], #0x80]\n"
    ".inst 0x4e9f956a  // sdot v10.4s, v11.16b, v31.16b\n"
    "and v18.16b, v30.16b, v21.16b\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    ".inst 0x4e9f9736  // sdot v22.4s, v25.16b, v31.16b\n"
    "ldr q31, [SP, #0x40]\n"
    ".inst 0x4e9a9733  // sdot v19.4s, v25.16b, v26.16b\n"
    "ldr q25, [%x[params], #0x90]\n"
    "mov v17.16b, v10.16b\n"
    ".inst 0x4e86956a  // sdot v10.4s, v11.16b, v6.16b\n"
    "ldr q6, [SP, #0x0]\n"
    "mls v22.4s, v10.4s, v14.4s\n"
    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
    "ldr q26, [SP, #0x60]\n"
    "sqadd v30.4s, v30.4s, v18.4s\n"
    "mls v19.4s, v17.4s, v14.4s\n"
    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
    "movi v15.4s, #0x0\n"
    "srshl v30.4s, v30.4s, v21.4s\n"
    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
    ".inst 0x4e82956f  // sdot v15.4s, v11.16b, v2.16b\n"
    "and v16.16b, v20.16b, v21.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "add v30.4s, v30.4s, v13.4s\n"
    "and v17.16b, v22.16b, v21.16b\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "smax v30.4s, v30.4s, v9.4s\n"
    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
    "ldr q23, [%x[params], #0xa0]\n"
    ".inst 0x4e9c956f  // sdot v15.4s, v11.16b, v28.16b\n"
    "sqadd v20.4s, v20.4s, v16.4s\n"
    "smin v30.4s, v30.4s, v12.4s\n"
    "and v16.16b, v19.16b, v21.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqadd v22.4s, v22.4s, v17.4s\n"
    "srshl v20.4s, v20.4s, v21.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "mov v17.16b, v15.16b\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "str s30, [x25, x23]\n"
    "srshl v22.4s, v22.4s, v21.4s\n"
    "add v20.4s, v20.4s, v13.4s\n"
    "ldr q30, [%x[params], #0x60]\n"
    "sqadd v19.4s, v19.4s, v16.4s\n"
    ".inst 0x4e88956f  // sdot v15.4s, v11.16b, v8.16b\n"
    "smax v20.4s, v20.4s, v9.4s\n"
    "add v22.4s, v22.4s, v13.4s\n"
    "srshl v19.4s, v19.4s, v21.4s\n"
    "ldr q21, [%x[params], #0xb0]\n"
    "smin v20.4s, v20.4s, v12.4s\n"
    "smax v22.4s, v22.4s, v9.4s\n"
    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
    "add v19.4s, v19.4s, v13.4s\n"
    "smin v22.4s, v22.4s, v12.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "smax v19.4s, v19.4s, v9.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "str s20, [x22, x23]\n"
    "smin v19.4s, v19.4s, v12.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "str s22, [x24, x23]\n"
    "mov v22.16b, v30.16b\n"
    "mov v20.16b, v30.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    ".inst 0x4e8297b4  // sdot v20.4s, v29.16b, v2.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "str s19, [x21, x23]\n"
    "mov v19.16b, v30.16b\n"
    "add x23, x23, #0x4\n"
    ".inst 0x4e8897be  // sdot v30.4s, v29.16b, v8.16b\n"
    ".inst 0x4e9c9774  // sdot v20.4s, v27.16b, v28.16b\n"
    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
    "movi v10.4s, #0x0\n"
    ".inst 0x4e82977e  // sdot v30.4s, v27.16b, v2.16b\n"
    ".inst 0x4e989734  // sdot v20.4s, v25.16b, v24.16b\n"
    "mls v20.4s, v17.4s, v14.4s\n"
    ".inst 0x4e9c973e  // sdot v30.4s, v25.16b, v28.16b\n"
    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
    "mls v30.4s, v15.4s, v14.4s\n"
    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
    ".inst 0x4e8897b6  // sdot v22.4s, v29.16b, v8.16b\n"
    ".inst 0x4e8297b3  // sdot v19.4s, v29.16b, v2.16b\n"
    "ldr q29, [%x[params], #0xd0]\n"
    ".inst 0x4e82956a  // sdot v10.4s, v11.16b, v2.16b\n"
    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
    ".inst 0x4e829776  // sdot v22.4s, v27.16b, v2.16b\n"
    "ldr q2, [SP, #0x30]\n"
    ".inst 0x4e9c9773  // sdot v19.4s, v27.16b, v28.16b\n"
    "ldr q27, [%x[params], #0xe0]\n"
    ".inst 0x4e9c956a  // sdot v10.4s, v11.16b, v28.16b\n"
    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
    ".inst 0x4e9c9736  // sdot v22.4s, v25.16b, v28.16b\n"
    "ldr q28, [SP, #0x50]\n"
    ".inst 0x4e989733  // sdot v19.4s, v25.16b, v24.16b\n"
    "ldr q25, [%x[params], #0xf0]\n"
    "mov v17.16b, v10.16b\n"
    ".inst 0x4e88956a  // sdot v10.4s, v11.16b, v8.16b\n"
    "ldr q8, [SP, #0x10]\n"
    "mls v22.4s, v10.4s, v14.4s\n"
    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
    "ldr q24, [SP, #0x70]\n"
    "and v18.16b, v30.16b, v21.16b\n"
    "mls v19.4s, v17.4s, v14.4s\n"
    "and v16.16b, v20.16b, v21.16b\n"
    "movi v15.4s, #0x0\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    ".inst 0x4e83956f  // sdot v15.4s, v11.16b, v3.16b\n"
    "movi v10.4s, #0x0\n"
    "and v17.16b, v22.16b, v21.16b\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "sqadd v30.4s, v30.4s, v18.4s\n"
    "sqadd v20.4s, v20.4s, v16.4s\n"
    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
    "ldr q23, [%x[params], #0x100]\n"
    ".inst 0x4e9f956f  // sdot v15.4s, v11.16b, v31.16b\n"
    "srshl v30.4s, v30.4s, v21.4s\n"
    "srshl v20.4s, v20.4s, v21.4s\n"
    "sqadd v22.4s, v22.4s, v17.4s\n"
    "and v16.16b, v19.16b, v21.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "add v30.4s, v30.4s, v13.4s\n"
    "srshl v22.4s, v22.4s, v21.4s\n"
    "add v20.4s, v20.4s, v13.4s\n"
    "mov v17.16b, v15.16b\n"
    "smax v30.4s, v30.4s, v9.4s\n"
    "add v22.4s, v22.4s, v13.4s\n"
    "smax v20.4s, v20.4s, v9.4s\n"
    "smin v30.4s, v30.4s, v12.4s\n"
    "smax v22.4s, v22.4s, v9.4s\n"
    "smin v20.4s, v20.4s, v12.4s\n"
    "sqadd v19.4s, v19.4s, v16.4s\n"
    "smin v22.4s, v22.4s, v12.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "srshl v19.4s, v19.4s, v21.4s\n"
    "ldr q21, [%x[params], #0x110]\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "str s30, [x25, x23]\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "ldr q30, [%x[params], #0xc0]\n"
    "add v19.4s, v19.4s, v13.4s\n"
    "str s20, [x22, x23]\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "str s22, [x24, x23]\n"
    "smax v19.4s, v19.4s, v9.4s\n"
    ".inst 0x4e86956f  // sdot v15.4s, v11.16b, v6.16b\n"
    "mov v22.16b, v30.16b\n"
    "mov v20.16b, v30.16b\n"
    "smin v19.4s, v19.4s, v12.4s\n"
    ".inst 0x4e8397b4  // sdot v20.4s, v29.16b, v3.16b\n"
    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "str s19, [x21, x23]\n"
    "mov v19.16b, v30.16b\n"
    "add x23, x23, #0x4\n"
    ".inst 0x4e8697be  // sdot v30.4s, v29.16b, v6.16b\n"
    ".inst 0x4e9f9774  // sdot v20.4s, v27.16b, v31.16b\n"
    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
    ".inst 0x4e83977e  // sdot v30.4s, v27.16b, v3.16b\n"
    ".inst 0x4e9a9734  // sdot v20.4s, v25.16b, v26.16b\n"
    "mls v20.4s, v17.4s, v14.4s\n"
    ".inst 0x4e9f973e  // sdot v30.4s, v25.16b, v31.16b\n"
    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
    "mls v30.4s, v15.4s, v14.4s\n"
    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
    ".inst 0x4e8697b6  // sdot v22.4s, v29.16b, v6.16b\n"
    ".inst 0x4e8397b3  // sdot v19.4s, v29.16b, v3.16b\n"
    "ldr q29, [%x[params], #0x130]\n"
    ".inst 0x4e83956a  // sdot v10.4s, v11.16b, v3.16b\n"
    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
    ".inst 0x4e839776  // sdot v22.4s, v27.16b, v3.16b\n"
    ".inst 0x4e9f9773  // sdot v19.4s, v27.16b, v31.16b\n"
    "ldr q27, [%x[params], #0x140]\n"
    ".inst 0x4e9f956a  // sdot v10.4s, v11.16b, v31.16b\n"
    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
    ".inst 0x4e9f9736  // sdot v22.4s, v25.16b, v31.16b\n"
    ".inst 0x4e9a9733  // sdot v19.4s, v25.16b, v26.16b\n"
    "ldr q25, [%x[params], #0x150]\n"
    "mov v17.16b, v10.16b\n"
    ".inst 0x4e86956a  // sdot v10.4s, v11.16b, v6.16b\n"
    "mls v22.4s, v10.4s, v14.4s\n"
    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
    "and v18.16b, v30.16b, v21.16b\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    "and v16.16b, v20.16b, v21.16b\n"
    "movi v15.4s, #0x0\n"
    "mls v19.4s, v17.4s, v14.4s\n"
    ".inst 0x4e82956f  // sdot v15.4s, v11.16b, v2.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
    "movi v10.4s, #0x0\n"
    "sqadd v30.4s, v30.4s, v18.4s\n"
    ".inst 0x4e9c956f  // sdot v15.4s, v11.16b, v28.16b\n"
    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
    "ldr q23, [%x[params], #0x160]\n"
    "and v17.16b, v22.16b, v21.16b\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "srshl v30.4s, v30.4s, v21.4s\n"
    "sqadd v20.4s, v20.4s, v16.4s\n"
    "and v16.16b, v19.16b, v21.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "add v30.4s, v30.4s, v13.4s\n"
    "srshl v20.4s, v20.4s, v21.4s\n"
    "sqadd v22.4s, v22.4s, v17.4s\n"
    "mov v17.16b, v15.16b\n"
    "smax v30.4s, v30.4s, v9.4s\n"
    "add v20.4s, v20.4s, v13.4s\n"
    "srshl v22.4s, v22.4s, v21.4s\n"
    "smin v30.4s, v30.4s, v12.4s\n"
    "smax v20.4s, v20.4s, v9.4s\n"
    "sqadd v19.4s, v19.4s, v16.4s\n"
    "add v22.4s, v22.4s, v13.4s\n"
    "smin v20.4s, v20.4s, v12.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "smax v22.4s, v22.4s, v9.4s\n"
    "srshl v19.4s, v19.4s, v21.4s\n"
    "ldr q21, [%x[params], #0x170]\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "str s30, [x25, x23]\n"
    "smin v22.4s, v22.4s, v12.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "ldr q30, [%x[params], #0x120]\n"
    "add %x[params], %x[params], #0x180\n"
    "add v19.4s, v19.4s, v13.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "str s20, [x22, x23]\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    ".inst 0x4e88956f  // sdot v15.4s, v11.16b, v8.16b\n"
    "smax v19.4s, v19.4s, v9.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "str s22, [x24, x23]\n"
    "smin v19.4s, v19.4s, v12.4s\n"
    "mov v22.16b, v30.16b\n"
    "mov v20.16b, v30.16b\n"
    ".inst 0x4e8297b4  // sdot v20.4s, v29.16b, v2.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "str s19, [x21, x23]\n"
    "mov v19.16b, v30.16b\n"
    "add x23, x23, #0x4\n"
    ".inst 0x4e8897be  // sdot v30.4s, v29.16b, v8.16b\n"
    ".inst 0x4e9c9774  // sdot v20.4s, v27.16b, v28.16b\n"
    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
    ".inst 0x4e82977e  // sdot v30.4s, v27.16b, v2.16b\n"
    ".inst 0x4e989734  // sdot v20.4s, v25.16b, v24.16b\n"
    "mls v20.4s, v17.4s, v14.4s\n"
    ".inst 0x4e9c973e  // sdot v30.4s, v25.16b, v28.16b\n"
    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
    "mls v30.4s, v15.4s, v14.4s\n"
    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
    ".inst 0x4e8897b6  // sdot v22.4s, v29.16b, v8.16b\n"
    ".inst 0x4e8297b3  // sdot v19.4s, v29.16b, v2.16b\n"
    ".inst 0x4e82956a  // sdot v10.4s, v11.16b, v2.16b\n"
    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
    ".inst 0x4e829776  // sdot v22.4s, v27.16b, v2.16b\n"
    ".inst 0x4e9c9773  // sdot v19.4s, v27.16b, v28.16b\n"
    ".inst 0x4e9c956a  // sdot v10.4s, v11.16b, v28.16b\n"
    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
    ".inst 0x4e9c9736  // sdot v22.4s, v25.16b, v28.16b\n"
    ".inst 0x4e989733  // sdot v19.4s, v25.16b, v24.16b\n"
    "mov v17.16b, v10.16b\n"
    ".inst 0x4e88956a  // sdot v10.4s, v11.16b, v8.16b\n"
    "mls v22.4s, v10.4s, v14.4s\n"
    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
    "and v18.16b, v30.16b, v21.16b\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    "and v16.16b, v20.16b, v21.16b\n"
    "mls v19.4s, v17.4s, v14.4s\n"
    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqadd v30.4s, v30.4s, v18.4s\n"
    "and v17.16b, v22.16b, v21.16b\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "srshl v30.4s, v30.4s, v21.4s\n"
    "sqadd v20.4s, v20.4s, v16.4s\n"
    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
    "add v30.4s, v30.4s, v13.4s\n"
    "srshl v20.4s, v20.4s, v21.4s\n"
    "sqadd v22.4s, v22.4s, v17.4s\n"
    "and v16.16b, v19.16b, v21.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "smax v30.4s, v30.4s, v9.4s\n"
    "srshl v22.4s, v22.4s, v21.4s\n"
    "add v20.4s, v20.4s, v13.4s\n"
    "smin v30.4s, v30.4s, v12.4s\n"
    "add v22.4s, v22.4s, v13.4s\n"
    "smax v20.4s, v20.4s, v9.4s\n"
    "sqadd v19.4s, v19.4s, v16.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "smax v22.4s, v22.4s, v9.4s\n"
    "smin v20.4s, v20.4s, v12.4s\n"
    "srshl v19.4s, v19.4s, v21.4s\n"
    "smin v22.4s, v22.4s, v12.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "str s30, [x25, x23]\n"
    "add v19.4s, v19.4s, v13.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "str s22, [x24, x23]\n"
    "smax v19.4s, v19.4s, v9.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "str s20, [x22, x23]\n"
    "smin v19.4s, v19.4s, v12.4s\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "str s19, [x21, x23]\n"
    "add x23, x23, #0x4\n"
    "bgt 1b\n"
    "tst %x[n_channels], #0xf\n"
    "beq 34f\n"
    "2:"  // Oddments
    "and x19, %x[n_channels], #0xf\n"
    "add x13, x13, x23\n"
    "add x12, x12, x23\n"
    "add x11, x11, x23\n"
    "add x10, x10, x23\n"
    "add x9, x9, x23\n"
    "add x28, x28, x23\n"
    "add x27, x27, x23\n"
    "add x26, x26, x23\n"
    "tbz %x[n_channels], #3, 6f\n"
    "ld1 { v27.d }[0], [x13], #0x8\n"
    "ld1 { v1.d }[0], [x12], #0x8\n"
    "ld1 { v25.d }[0], [x11], #0x8\n"
    "ld1 { v23.d }[0], [x10], #0x8\n"
    "ld1 { v31.d }[0], [x9], #0x8\n"
    "ld1 { v28.d }[0], [x28], #0x8\n"
    "ld1 { v21.d }[0], [x27], #0x8\n"
    "ld1 { v26.d }[0], [x26], #0x8\n"
    "tbz %x[n_channels], #2, 4f\n"
    "ld1 { v27.s }[2], [x13], #0x4\n"
    "ld1 { v1.s }[2], [x12], #0x4\n"
    "ld1 { v25.s }[2], [x11], #0x4\n"
    "ld1 { v23.s }[2], [x10], #0x4\n"
    "ld1 { v31.s }[2], [x9], #0x4\n"
    "ld1 { v28.s }[2], [x28], #0x4\n"
    "ld1 { v21.s }[2], [x27], #0x4\n"
    "ld1 { v26.s }[2], [x26], #0x4\n"
    "tbz %x[n_channels], #1, 3f\n"
    "ld1 { v27.h }[6], [x13], #0x2\n"
    "ld1 { v1.h }[6], [x12], #0x2\n"
    "ld1 { v25.h }[6], [x11], #0x2\n"
    "ld1 { v23.h }[6], [x10], #0x2\n"
    "ld1 { v31.h }[6], [x9], #0x2\n"
    "ld1 { v28.h }[6], [x28], #0x2\n"
    "ld1 { v21.h }[6], [x27], #0x2\n"
    "ld1 { v26.h }[6], [x26], #0x2\n"
    "tbz %x[n_channels], #0, 10f\n"
    "ld1 { v27.b }[14], [x13], #0x1\n"
    "ld1 { v1.b }[14], [x12], #0x1\n"
    "ld1 { v25.b }[14], [x11], #0x1\n"
    "ld1 { v23.b }[14], [x10], #0x1\n"
    "ld1 { v31.b }[14], [x9], #0x1\n"
    "ld1 { v28.b }[14], [x28], #0x1\n"
    "ld1 { v21.b }[14], [x27], #0x1\n"
    "ld1 { v26.b }[14], [x26], #0x1\n"
    "b 10f\n"
    "3:"  // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
    "tbz %x[n_channels], #0, 10f\n"
    "ld1 { v27.b }[12], [x13], #0x1\n"
    "ld1 { v1.b }[12], [x12], #0x1\n"
    "ld1 { v25.b }[12], [x11], #0x1\n"
    "ld1 { v23.b }[12], [x10], #0x1\n"
    "ld1 { v31.b }[12], [x9], #0x1\n"
    "ld1 { v28.b }[12], [x28], #0x1\n"
    "ld1 { v21.b }[12], [x27], #0x1\n"
    "ld1 { v26.b }[12], [x26], #0x1\n"
    "b 10f\n"
    "4:"  // Oddments: Load (A): Bit 3: Bit 2: Unset
    "tbz %x[n_channels], #1, 5f\n"
    "ld1 { v27.h }[4], [x13], #0x2\n"
    "ld1 { v1.h }[4], [x12], #0x2\n"
    "ld1 { v25.h }[4], [x11], #0x2\n"
    "ld1 { v23.h }[4], [x10], #0x2\n"
    "ld1 { v31.h }[4], [x9], #0x2\n"
    "ld1 { v28.h }[4], [x28], #0x2\n"
    "ld1 { v21.h }[4], [x27], #0x2\n"
    "ld1 { v26.h }[4], [x26], #0x2\n"
    "tbz %x[n_channels], #0, 10f\n"
    "ld1 { v27.b }[10], [x13], #0x1\n"
    "ld1 { v1.b }[10], [x12], #0x1\n"
    "ld1 { v25.b }[10], [x11], #0x1\n"
    "ld1 { v23.b }[10], [x10], #0x1\n"
    "ld1 { v31.b }[10], [x9], #0x1\n"
    "ld1 { v28.b }[10], [x28], #0x1\n"
    "ld1 { v21.b }[10], [x27], #0x1\n"
    "ld1 { v26.b }[10], [x26], #0x1\n"
    "b 10f\n"
    "5:"  // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
    "tbz %x[n_channels], #0, 10f\n"
    "ld1 { v27.b }[8], [x13], #0x1\n"
    "ld1 { v1.b }[8], [x12], #0x1\n"
    "ld1 { v25.b }[8], [x11], #0x1\n"
    "ld1 { v23.b }[8], [x10], #0x1\n"
    "ld1 { v31.b }[8], [x9], #0x1\n"
    "ld1 { v28.b }[8], [x28], #0x1\n"
    "ld1 { v21.b }[8], [x27], #0x1\n"
    "ld1 { v26.b }[8], [x26], #0x1\n"
    "b 10f\n"
    "6:"  // Oddments: Load (A): Bit 3: Unset
    "tbz %x[n_channels], #2, 8f\n"
    "ld1 { v27.s }[0], [x13], #0x4\n"
    "ld1 { v1.s }[0], [x12], #0x4\n"
    "ld1 { v25.s }[0], [x11], #0x4\n"
    "ld1 { v23.s }[0], [x10], #0x4\n"
    "ld1 { v31.s }[0], [x9], #0x4\n"
    "ld1 { v28.s }[0], [x28], #0x4\n"
    "ld1 { v21.s }[0], [x27], #0x4\n"
    "ld1 { v26.s }[0], [x26], #0x4\n"
    "tbz %x[n_channels], #1, 7f\n"
    "ld1 { v27.h }[2], [x13], #0x2\n"
    "ld1 { v1.h }[2], [x12], #0x2\n"
    "ld1 { v25.h }[2], [x11], #0x2\n"
    "ld1 { v23.h }[2], [x10], #0x2\n"
    "ld1 { v31.h }[2], [x9], #0x2\n"
    "ld1 { v28.h }[2], [x28], #0x2\n"
    "ld1 { v21.h }[2], [x27], #0x2\n"
    "ld1 { v26.h }[2], [x26], #0x2\n"
    "tbz %x[n_channels], #0, 10f\n"
    "ld1 { v27.b }[6], [x13], #0x1\n"
    "ld1 { v1.b }[6], [x12], #0x1\n"
    "ld1 { v25.b }[6], [x11], #0x1\n"
    "ld1 { v23.b }[6], [x10], #0x1\n"
    "ld1 { v31.b }[6], [x9], #0x1\n"
    "ld1 { v28.b }[6], [x28], #0x1\n"
    "ld1 { v21.b }[6], [x27], #0x1\n"
    "ld1 { v26.b }[6], [x26], #0x1\n"
    "b 10f\n"
    "7:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
    "tbz %x[n_channels], #0, 10f\n"
    "ld1 { v27.b }[4], [x13], #0x1\n"
    "ld1 { v1.b }[4], [x12], #0x1\n"
    "ld1 { v25.b }[4], [x11], #0x1\n"
    "ld1 { v23.b }[4], [x10], #0x1\n"
    "ld1 { v31.b }[4], [x9], #0x1\n"
    "ld1 { v28.b }[4], [x28], #0x1\n"
    "ld1 { v21.b }[4], [x27], #0x1\n"
    "ld1 { v26.b }[4], [x26], #0x1\n"
    "b 10f\n"
    "8:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
    "tbz %x[n_channels], #1, 9f\n"
    "ld1 { v27.h }[0], [x13], #0x2\n"
    "ld1 { v1.h }[0], [x12], #0x2\n"
    "ld1 { v25.h }[0], [x11], #0x2\n"
    "ld1 { v23.h }[0], [x10], #0x2\n"
    "ld1 { v31.h }[0], [x9], #0x2\n"
    "ld1 { v28.h }[0], [x28], #0x2\n"
    "ld1 { v21.h }[0], [x27], #0x2\n"
    "ld1 { v26.h }[0], [x26], #0x2\n"
    "tbz %x[n_channels], #0, 10f\n"
    "ld1 { v27.b }[2], [x13], #0x1\n"
    "ld1 { v1.b }[2], [x12], #0x1\n"
    "ld1 { v25.b }[2], [x11], #0x1\n"
    "ld1 { v23.b }[2], [x10], #0x1\n"
    "ld1 { v31.b }[2], [x9], #0x1\n"
    "ld1 { v28.b }[2], [x28], #0x1\n"
    "ld1 { v21.b }[2], [x27], #0x1\n"
    "ld1 { v26.b }[2], [x26], #0x1\n"
    "b 10f\n"
    "9:"  // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
    "tbz %x[n_channels], #0, 10f\n"
    "ld1 { v27.b }[0], [x13], #0x1\n"
    "ld1 { v1.b }[0], [x12], #0x1\n"
    "ld1 { v25.b }[0], [x11], #0x1\n"
    "ld1 { v23.b }[0], [x10], #0x1\n"
    "ld1 { v31.b }[0], [x9], #0x1\n"
    "ld1 { v28.b }[0], [x28], #0x1\n"
    "ld1 { v21.b }[0], [x27], #0x1\n"
    "ld1 { v26.b }[0], [x26], #0x1\n"
    "10:"  // Oddments: Load (A): Bit 3: End
    "ldp x13, x12, [%x[inptrs], #0x40]\n"
    "add x13, x13, x23\n"
    "ldp x11, x10, [%x[inptrs], #0x50]\n"
    "ldp x9, x28, [%x[inptrs], #0x60]\n"
    "add x12, x12, x23\n"
    "ldp x27, x26, [%x[inptrs], #0x70]\n"
    "add x11, x11, x23\n"
    "add x10, x10, x23\n"
    "add x9, x9, x23\n"
    "add x28, x28, x23\n"
    "add x27, x27, x23\n"
    "add x26, x26, x23\n"
    "tbz %x[n_channels], #3, 14f\n"
    "ld1 { v24.d }[0], [x13], #0x8\n"
    "ld1 { v22.d }[0], [x12], #0x8\n"
    "ld1 { v20.d }[0], [x11], #0x8\n"
    "ld1 { v16.d }[0], [x10], #0x8\n"
    "ld1 { v19.d }[0], [x9], #0x8\n"
    "ld1 { v0.d }[0], [x28], #0x8\n"
    "ld1 { v18.d }[0], [x27], #0x8\n"
    "ld1 { v17.d }[0], [x26], #0x8\n"
    "tbz %x[n_channels], #2, 12f\n"
    "ld1 { v24.s }[2], [x13], #0x4\n"
    "ld1 { v22.s }[2], [x12], #0x4\n"
    "ld1 { v20.s }[2], [x11], #0x4\n"
    "ld1 { v16.s }[2], [x10], #0x4\n"
    "ld1 { v19.s }[2], [x9], #0x4\n"
    "ld1 { v0.s }[2], [x28], #0x4\n"
    "ld1 { v18.s }[2], [x27], #0x4\n"
    "ld1 { v17.s }[2], [x26], #0x4\n"
    "tbz %x[n_channels], #1, 11f\n"
    "ld1 { v24.h }[6], [x13], #0x2\n"
    "ld1 { v22.h }[6], [x12], #0x2\n"
    "ld1 { v20.h }[6], [x11], #0x2\n"
    "ld1 { v16.h }[6], [x10], #0x2\n"
    "ld1 { v19.h }[6], [x9], #0x2\n"
    "ld1 { v0.h }[6], [x28], #0x2\n"
    "ld1 { v18.h }[6], [x27], #0x2\n"
    "ld1 { v17.h }[6], [x26], #0x2\n"
    "tbz %x[n_channels], #0, 18f\n"
    "ld1 { v24.b }[14], [x13], #0x1\n"
    "ld1 { v22.b }[14], [x12], #0x1\n"
    "ld1 { v20.b }[14], [x11], #0x1\n"
    "ld1 { v16.b }[14], [x10], #0x1\n"
    "ld1 { v19.b }[14], [x9], #0x1\n"
    "ld1 { v0.b }[14], [x28], #0x1\n"
    "ld1 { v18.b }[14], [x27], #0x1\n"
    "ld1 { v17.b }[14], [x26], #0x1\n"
    "b 18f\n"
    "11:"  // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
    "tbz %x[n_channels], #0, 18f\n"
    "ld1 { v24.b }[12], [x13], #0x1\n"
    "ld1 { v22.b }[12], [x12], #0x1\n"
    "ld1 { v20.b }[12], [x11], #0x1\n"
    "ld1 { v16.b }[12], [x10], #0x1\n"
    "ld1 { v19.b }[12], [x9], #0x1\n"
    "ld1 { v0.b }[12], [x28], #0x1\n"
    "ld1 { v18.b }[12], [x27], #0x1\n"
    "ld1 { v17.b }[12], [x26], #0x1\n"
    "b 18f\n"
    "12:"  // Oddments: Load (B): Bit 3: Bit 2: Unset
    "tbz %x[n_channels], #1, 13f\n"
    "ld1 { v24.h }[4], [x13], #0x2\n"
    "ld1 { v22.h }[4], [x12], #0x2\n"
    "ld1 { v20.h }[4], [x11], #0x2\n"
    "ld1 { v16.h }[4], [x10], #0x2\n"
    "ld1 { v19.h }[4], [x9], #0x2\n"
    "ld1 { v0.h }[4], [x28], #0x2\n"
    "ld1 { v18.h }[4], [x27], #0x2\n"
    "ld1 { v17.h }[4], [x26], #0x2\n"
    "tbz %x[n_channels], #0, 18f\n"
    "ld1 { v24.b }[10], [x13], #0x1\n"
    "ld1 { v22.b }[10], [x12], #0x1\n"
    "ld1 { v20.b }[10], [x11], #0x1\n"
    "ld1 { v16.b }[10], [x10], #0x1\n"
    "ld1 { v19.b }[10], [x9], #0x1\n"
    "ld1 { v0.b }[10], [x28], #0x1\n"
    "ld1 { v18.b }[10], [x27], #0x1\n"
    "ld1 { v17.b }[10], [x26], #0x1\n"
    "b 18f\n"
    "13:"  // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
    "tbz %x[n_channels], #0, 18f\n"
    "ld1 { v24.b }[8], [x13], #0x1\n"
    "ld1 { v22.b }[8], [x12], #0x1\n"
    "ld1 { v20.b }[8], [x11], #0x1\n"
    "ld1 { v16.b }[8], [x10], #0x1\n"
    "ld1 { v19.b }[8], [x9], #0x1\n"
    "ld1 { v0.b }[8], [x28], #0x1\n"
    "ld1 { v18.b }[8], [x27], #0x1\n"
    "ld1 { v17.b }[8], [x26], #0x1\n"
    "b 18f\n"
    "14:"  // Oddments: Load (B): Bit 3: Unset
    "tbz %x[n_channels], #2, 16f\n"
    "ld1 { v24.s }[0], [x13], #0x4\n"
    "ld1 { v22.s }[0], [x12], #0x4\n"
    "ld1 { v20.s }[0], [x11], #0x4\n"
    "ld1 { v16.s }[0], [x10], #0x4\n"
    "ld1 { v19.s }[0], [x9], #0x4\n"
    "ld1 { v0.s }[0], [x28], #0x4\n"
    "ld1 { v18.s }[0], [x27], #0x4\n"
    "ld1 { v17.s }[0], [x26], #0x4\n"
    "tbz %x[n_channels], #1, 15f\n"
    "ld1 { v24.h }[2], [x13], #0x2\n"
    "ld1 { v22.h }[2], [x12], #0x2\n"
    "ld1 { v20.h }[2], [x11], #0x2\n"
    "ld1 { v16.h }[2], [x10], #0x2\n"
    "ld1 { v19.h }[2], [x9], #0x2\n"
    "ld1 { v0.h }[2], [x28], #0x2\n"
    "ld1 { v18.h }[2], [x27], #0x2\n"
    "ld1 { v17.h }[2], [x26], #0x2\n"
    "tbz %x[n_channels], #0, 18f\n"
    "ld1 { v24.b }[6], [x13], #0x1\n"
    "ld1 { v22.b }[6], [x12], #0x1\n"
    "ld1 { v20.b }[6], [x11], #0x1\n"
    "ld1 { v16.b }[6], [x10], #0x1\n"
    "ld1 { v19.b }[6], [x9], #0x1\n"
    "ld1 { v0.b }[6], [x28], #0x1\n"
    "ld1 { v18.b }[6], [x27], #0x1\n"
    "ld1 { v17.b }[6], [x26], #0x1\n"
    "b 18f\n"
    "15:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
    "tbz %x[n_channels], #0, 18f\n"
    "ld1 { v24.b }[4], [x13], #0x1\n"
    "ld1 { v22.b }[4], [x12], #0x1\n"
    "ld1 { v20.b }[4], [x11], #0x1\n"
    "ld1 { v16.b }[4], [x10], #0x1\n"
    "ld1 { v19.b }[4], [x9], #0x1\n"
    "ld1 { v0.b }[4], [x28], #0x1\n"
    "ld1 { v18.b }[4], [x27], #0x1\n"
    "ld1 { v17.b }[4], [x26], #0x1\n"
    "b 18f\n"
    "16:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
    "tbz %x[n_channels], #1, 17f\n"
    "ld1 { v24.h }[0], [x13], #0x2\n"
    "ld1 { v22.h }[0], [x12], #0x2\n"
    "ld1 { v20.h }[0], [x11], #0x2\n"
    "ld1 { v16.h }[0], [x10], #0x2\n"
    "ld1 { v19.h }[0], [x9], #0x2\n"
    "ld1 { v0.h }[0], [x28], #0x2\n"
    "ld1 { v18.h }[0], [x27], #0x2\n"
    "ld1 { v17.h }[0], [x26], #0x2\n"
    "tbz %x[n_channels], #0, 18f\n"
    "ld1 { v24.b }[2], [x13], #0x1\n"
    "ld1 { v22.b }[2], [x12], #0x1\n"
    "ld1 { v20.b }[2], [x11], #0x1\n"
    "ld1 { v16.b }[2], [x10], #0x1\n"
    "ld1 { v19.b }[2], [x9], #0x1\n"
    "ld1 { v0.b }[2], [x28], #0x1\n"
    "ld1 { v18.b }[2], [x27], #0x1\n"
    "ld1 { v17.b }[2], [x26], #0x1\n"
    "b 18f\n"
    "17:"  // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
    "tbz %x[n_channels], #0, 18f\n"
    "ld1 { v24.b }[0], [x13], #0x1\n"
    "ld1 { v22.b }[0], [x12], #0x1\n"
    "ld1 { v20.b }[0], [x11], #0x1\n"
    "ld1 { v16.b }[0], [x10], #0x1\n"
    "ld1 { v19.b }[0], [x9], #0x1\n"
    "ld1 { v0.b }[0], [x28], #0x1\n"
    "ld1 { v18.b }[0], [x27], #0x1\n"
    "ld1 { v17.b }[0], [x26], #0x1\n"
    "18:"  // Oddments: Load (B): Bit 3: End
    "zip1 v7.16b, v27.16b, v25.16b\n"
    "ldr q30, [%x[params], #0x0]\n"
    "cmp x19, #0x4\n"
    "zip2 v5.16b, v27.16b, v25.16b\n"
    "ldr q29, [%x[params], #0x10]\n"
    "zip1 v8.16b, v1.16b, v23.16b\n"
    "ldr q27, [%x[params], #0x20]\n"
    "zip2 v3.16b, v1.16b, v23.16b\n"
    "ldr q25, [%x[params], #0x30]\n"
    "zip1 v2.16b, v31.16b, v21.16b\n"
    "ldr q23, [%x[params], #0x40]\n"
    "zip2 v4.16b, v31.16b, v21.16b\n"
    "ldr q21, [%x[params], #0x50]\n"
    "add %x[params], %x[params], #0x60\n"
    "zip1 v1.16b, v28.16b, v26.16b\n"
    "zip2 v31.16b, v28.16b, v26.16b\n"
    "zip1 v28.16b, v24.16b, v20.16b\n"
    "zip2 v26.16b, v24.16b, v20.16b\n"
    "zip1 v24.16b, v22.16b, v16.16b\n"
    "zip2 v22.16b, v22.16b, v16.16b\n"
    "zip1 v20.16b, v19.16b, v18.16b\n"
    "zip2 v19.16b, v19.16b, v18.16b\n"
    "zip1 v18.16b, v0.16b, v17.16b\n"
    "zip2 v17.16b, v0.16b, v17.16b\n"
    "zip1 v6.16b, v7.16b, v8.16b\n"
    "zip2 v8.16b, v7.16b, v8.16b\n"
    "zip1 v7.16b, v5.16b, v3.16b\n"
    "str q7, [SP, #0x0]\n"
    "zip2 v5.16b, v5.16b, v3.16b\n"
    "str q5, [SP, #0x10]\n"
    "zip1 v3.16b, v2.16b, v1.16b\n"
    "zip2 v2.16b, v2.16b, v1.16b\n"
    "zip1 v1.16b, v4.16b, v31.16b\n"
    "str q1, [SP, #0x20]\n"
    "zip2 v16.16b, v4.16b, v31.16b\n"
    "str q16, [SP, #0x30]\n"
    "zip1 v31.16b, v28.16b, v24.16b\n"
    "zip2 v28.16b, v28.16b, v24.16b\n"
    "zip1 v16.16b, v26.16b, v22.16b\n"
    "str q16, [SP, #0x40]\n"
    "zip2 v16.16b, v26.16b, v22.16b\n"
    "str q16, [SP, #0x50]\n"
    "zip1 v26.16b, v20.16b, v18.16b\n"
    "zip2 v24.16b, v20.16b, v18.16b\n"
    "zip1 v16.16b, v19.16b, v17.16b\n"
    "str q16, [SP, #0x60]\n"
    "zip2 v16.16b, v19.16b, v17.16b\n"
    "str q16, [SP, #0x70]\n"
    "mov v22.16b, v30.16b\n"
    "mov v20.16b, v30.16b\n"
    "mov v19.16b, v30.16b\n"
    ".inst 0x4e8697be  // sdot v30.4s, v29.16b, v6.16b\n"
    ".inst 0x4e8397b4  // sdot v20.4s, v29.16b, v3.16b\n"
    "movi v15.4s, #0x0\n"
    ".inst 0x4e83956f  // sdot v15.4s, v11.16b, v3.16b\n"
    ".inst 0x4e83977e  // sdot v30.4s, v27.16b, v3.16b\n"
    ".inst 0x4e9f9774  // sdot v20.4s, v27.16b, v31.16b\n"
    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
    ".inst 0x4e9f956f  // sdot v15.4s, v11.16b, v31.16b\n"
    ".inst 0x4e9f973e  // sdot v30.4s, v25.16b, v31.16b\n"
    ".inst 0x4e9a9734  // sdot v20.4s, v25.16b, v26.16b\n"
    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
    "mov v17.16b, v15.16b\n"
    ".inst 0x4e86956f  // sdot v15.4s, v11.16b, v6.16b\n"
    "mls v30.4s, v15.4s, v14.4s\n"
    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
    "mls v20.4s, v17.4s, v14.4s\n"
    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
    ".inst 0x4e8697b6  // sdot v22.4s, v29.16b, v6.16b\n"
    ".inst 0x4e8397b3  // sdot v19.4s, v29.16b, v3.16b\n"
    "movi v10.4s, #0x0\n"
    ".inst 0x4e83956a  // sdot v10.4s, v11.16b, v3.16b\n"
    ".inst 0x4e839776  // sdot v22.4s, v27.16b, v3.16b\n"
    ".inst 0x4e9f9773  // sdot v19.4s, v27.16b, v31.16b\n"
    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
    ".inst 0x4e9f956a  // sdot v10.4s, v11.16b, v31.16b\n"
    ".inst 0x4e9f9736  // sdot v22.4s, v25.16b, v31.16b\n"
    ".inst 0x4e9a9733  // sdot v19.4s, v25.16b, v26.16b\n"
    "and v18.16b, v30.16b, v21.16b\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    "mov v17.16b, v10.16b\n"
    ".inst 0x4e86956a  // sdot v10.4s, v11.16b, v6.16b\n"
    "mls v22.4s, v10.4s, v14.4s\n"
    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
    "mls v19.4s, v17.4s, v14.4s\n"
    "sqadd v30.4s, v30.4s, v18.4s\n"
    "and v16.16b, v20.16b, v21.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
    "srshl v30.4s, v30.4s, v21.4s\n"
    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
    "and v17.16b, v22.16b, v21.16b\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "add v30.4s, v30.4s, v13.4s\n"
    "sqadd v20.4s, v20.4s, v16.4s\n"
    "and v16.16b, v19.16b, v21.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "smax v30.4s, v30.4s, v9.4s\n"
    "srshl v20.4s, v20.4s, v21.4s\n"
    "sqadd v22.4s, v22.4s, v17.4s\n"
    "smin v30.4s, v30.4s, v12.4s\n"
    "add v20.4s, v20.4s, v13.4s\n"
    "srshl v22.4s, v22.4s, v21.4s\n"
    "sqadd v19.4s, v19.4s, v16.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "smax v20.4s, v20.4s, v9.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "add v22.4s, v22.4s, v13.4s\n"
    "smin v20.4s, v20.4s, v12.4s\n"
    "srshl v19.4s, v19.4s, v21.4s\n"
    "smax v22.4s, v22.4s, v9.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "smin v22.4s, v22.4s, v12.4s\n"
    "add v19.4s, v19.4s, v13.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "smax v19.4s, v19.4s, v9.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "smin v19.4s, v19.4s, v12.4s\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "blt 19f\n"
    "str s30, [x25, x23]\n"
    "str s22, [x24, x23]\n"
    "str s20, [x22, x23]\n"
    "str s19, [x21, x23]\n"
    "b 22f\n"
    "19:"  // Oddments: Unroll 0: Oddment store
    "add x25, x25, x23\n"
    "add x24, x24, x23\n"
    "add x22, x22, x23\n"
    "add x21, x21, x23\n"
    "tbz x19, #1, 20f\n"
    "st1 { v30.h }[0], [x25], #0x2\n"
    "st1 { v22.h }[0], [x24], #0x2\n"
    "st1 { v20.h }[0], [x22], #0x2\n"
    "st1 { v19.h }[0], [x21], #0x2\n"
    "tbz x19, #0, 21f\n"
    "st1 { v30.b }[2], [x25], #0x1\n"
    "st1 { v22.b }[2], [x24], #0x1\n"
    "st1 { v20.b }[2], [x22], #0x1\n"
    "st1 { v19.b }[2], [x21], #0x1\n"
    "b 21f\n"
    "20:"  // Oddments: Unroll 0: Oddment store: Bit 1: Unset
    "tbz x19, #0, 21f\n"
    "st1 { v30.b }[0], [x25], #0x1\n"
    "st1 { v22.b }[0], [x24], #0x1\n"
    "st1 { v20.b }[0], [x22], #0x1\n"
    "st1 { v19.b }[0], [x21], #0x1\n"
    "21:"  // Oddments: Unroll 0: Oddment store: Bit 1: End

    "22:"  // Oddments: Unroll 0: After oddment store
    "add x23, x23, #0x4\n"
    "subs x19, x19, #0x4\n"
    "ble 34f\n"
    "movi v15.4s, #0x0\n"
    "ldr q30, [%x[params], #0x0]\n"
    ".inst 0x4e82956f  // sdot v15.4s, v11.16b, v2.16b\n"
    "ldr q29, [%x[params], #0x10]\n"
    "cmp x19, #0x4\n"
    "movi v10.4s, #0x0\n"
    "ldr q27, [%x[params], #0x20]\n"
    "ldr q25, [%x[params], #0x30]\n"
    "mov v22.16b, v30.16b\n"
    "ldr q23, [%x[params], #0x40]\n"
    "mov v20.16b, v30.16b\n"
    "ldr q21, [%x[params], #0x50]\n"
    "add %x[params], %x[params], #0x60\n"
    "mov v19.16b, v30.16b\n"
    ".inst 0x4e8897be  // sdot v30.4s, v29.16b, v8.16b\n"
    ".inst 0x4e8297b4  // sdot v20.4s, v29.16b, v2.16b\n"
    ".inst 0x4e9c956f  // sdot v15.4s, v11.16b, v28.16b\n"
    ".inst 0x4e82977e  // sdot v30.4s, v27.16b, v2.16b\n"
    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
    ".inst 0x4e9c9774  // sdot v20.4s, v27.16b, v28.16b\n"
    "mov v17.16b, v15.16b\n"
    ".inst 0x4e88956f  // sdot v15.4s, v11.16b, v8.16b\n"
    ".inst 0x4e9c973e  // sdot v30.4s, v25.16b, v28.16b\n"
    "mls v30.4s, v15.4s, v14.4s\n"
    ".inst 0x4e989734  // sdot v20.4s, v25.16b, v24.16b\n"
    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
    "mls v20.4s, v17.4s, v14.4s\n"
    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
    ".inst 0x4e8297b3  // sdot v19.4s, v29.16b, v2.16b\n"
    ".inst 0x4e82956a  // sdot v10.4s, v11.16b, v2.16b\n"
    ".inst 0x4e8897b6  // sdot v22.4s, v29.16b, v8.16b\n"
    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
    ".inst 0x4e9c9773  // sdot v19.4s, v27.16b, v28.16b\n"
    ".inst 0x4e9c956a  // sdot v10.4s, v11.16b, v28.16b\n"
    ".inst 0x4e829776  // sdot v22.4s, v27.16b, v2.16b\n"
    "and v18.16b, v30.16b, v21.16b\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    ".inst 0x4e9c9736  // sdot v22.4s, v25.16b, v28.16b\n"
    ".inst 0x4e989733  // sdot v19.4s, v25.16b, v24.16b\n"
    "mov v17.16b, v10.16b\n"
    ".inst 0x4e88956a  // sdot v10.4s, v11.16b, v8.16b\n"
    "mls v22.4s, v10.4s, v14.4s\n"
    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
    "sqadd v30.4s, v30.4s, v18.4s\n"
    "mls v19.4s, v17.4s, v14.4s\n"
    "srshl v30.4s, v30.4s, v21.4s\n"
    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
    "add v30.4s, v30.4s, v13.4s\n"
    "and v16.16b, v20.16b, v21.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "smax v30.4s, v30.4s, v9.4s\n"
    "and v17.16b, v22.16b, v21.16b\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "smin v30.4s, v30.4s, v12.4s\n"
    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
    "sqadd v20.4s, v20.4s, v16.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "and v16.16b, v19.16b, v21.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqadd v22.4s, v22.4s, v17.4s\n"
    "srshl v20.4s, v20.4s, v21.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "srshl v22.4s, v22.4s, v21.4s\n"
    "add v20.4s, v20.4s, v13.4s\n"
    "sqadd v19.4s, v19.4s, v16.4s\n"
    "smax v20.4s, v20.4s, v9.4s\n"
    "add v22.4s, v22.4s, v13.4s\n"
    "srshl v19.4s, v19.4s, v21.4s\n"
    "smin v20.4s, v20.4s, v12.4s\n"
    "smax v22.4s, v22.4s, v9.4s\n"
    "add v19.4s, v19.4s, v13.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "smin v22.4s, v22.4s, v12.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "smax v19.4s, v19.4s, v9.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "smin v19.4s, v19.4s, v12.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "blt 23f\n"
    "str s30, [x25, x23]\n"
    "str s22, [x24, x23]\n"
    "str s20, [x22, x23]\n"
    "str s19, [x21, x23]\n"
    "b 26f\n"
    "23:"  // Oddments: Unroll 1: Oddment store
    "add x25, x25, x23\n"
    "add x24, x24, x23\n"
    "add x22, x22, x23\n"
    "add x21, x21, x23\n"
    "tbz x19, #1, 24f\n"
    "st1 { v30.h }[0], [x25], #0x2\n"
    "st1 { v22.h }[0], [x24], #0x2\n"
    "st1 { v20.h }[0], [x22], #0x2\n"
    "st1 { v19.h }[0], [x21], #0x2\n"
    "tbz x19, #0, 25f\n"
    "st1 { v30.b }[2], [x25], #0x1\n"
    "st1 { v22.b }[2], [x24], #0x1\n"
    "st1 { v20.b }[2], [x22], #0x1\n"
    "st1 { v19.b }[2], [x21], #0x1\n"
    "b 25f\n"
    "24:"  // Oddments: Unroll 1: Oddment store: Bit 1: Unset
    "tbz x19, #0, 25f\n"
    "st1 { v30.b }[0], [x25], #0x1\n"
    "st1 { v22.b }[0], [x24], #0x1\n"
    "st1 { v20.b }[0], [x22], #0x1\n"
    "st1 { v19.b }[0], [x21], #0x1\n"
    "25:"  // Oddments: Unroll 1: Oddment store: Bit 1: End

    "26:"  // Oddments: Unroll 1: After oddment store
    "add x23, x23, #0x4\n"
    "subs x19, x19, #0x4\n"
    "ble 34f\n"
    "movi v15.4s, #0x0\n"
    "ldr q6, [SP, #0x0]\n"
    "movi v10.4s, #0x0\n"
    "ldr q3, [SP, #0x20]\n"
    "cmp x19, #0x4\n"
    ".inst 0x4e83956f  // sdot v15.4s, v11.16b, v3.16b\n"
    "ldr q31, [SP, #0x40]\n"
    "ldr q26, [SP, #0x60]\n"
    ".inst 0x4e9f956f  // sdot v15.4s, v11.16b, v31.16b\n"
    "ldr q30, [%x[params], #0x0]\n"
    "ldr q29, [%x[params], #0x10]\n"
    "mov v22.16b, v30.16b\n"
    "ldr q27, [%x[params], #0x20]\n"
    "mov v20.16b, v30.16b\n"
    "ldr q25, [%x[params], #0x30]\n"
    "mov v19.16b, v30.16b\n"
    "ldr q23, [%x[params], #0x40]\n"
    ".inst 0x4e8697be  // sdot v30.4s, v29.16b, v6.16b\n"
    "ldr q21, [%x[params], #0x50]\n"
    "add %x[params], %x[params], #0x60\n"
    ".inst 0x4e8397b4  // sdot v20.4s, v29.16b, v3.16b\n"
    "mov v17.16b, v15.16b\n"
    ".inst 0x4e86956f  // sdot v15.4s, v11.16b, v6.16b\n"
    ".inst 0x4e83977e  // sdot v30.4s, v27.16b, v3.16b\n"
    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
    ".inst 0x4e9f9774  // sdot v20.4s, v27.16b, v31.16b\n"
    "ext v6.16b, v6.16b, v6.16b, #0x1\n"
    ".inst 0x4e9f973e  // sdot v30.4s, v25.16b, v31.16b\n"
    "mls v30.4s, v15.4s, v14.4s\n"
    ".inst 0x4e9a9734  // sdot v20.4s, v25.16b, v26.16b\n"
    "ext v3.16b, v3.16b, v3.16b, #0x1\n"
    "mls v20.4s, v17.4s, v14.4s\n"
    "ext v31.16b, v31.16b, v31.16b, #0x1\n"
    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
    ".inst 0x4e8697b6  // sdot v22.4s, v29.16b, v6.16b\n"
    ".inst 0x4e8397b3  // sdot v19.4s, v29.16b, v3.16b\n"
    ".inst 0x4e83956a  // sdot v10.4s, v11.16b, v3.16b\n"
    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
    ".inst 0x4e839776  // sdot v22.4s, v27.16b, v3.16b\n"
    ".inst 0x4e9f9773  // sdot v19.4s, v27.16b, v31.16b\n"
    ".inst 0x4e9f956a  // sdot v10.4s, v11.16b, v31.16b\n"
    "and v18.16b, v30.16b, v21.16b\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    ".inst 0x4e9f9736  // sdot v22.4s, v25.16b, v31.16b\n"
    ".inst 0x4e9a9733  // sdot v19.4s, v25.16b, v26.16b\n"
    "mov v17.16b, v10.16b\n"
    ".inst 0x4e86956a  // sdot v10.4s, v11.16b, v6.16b\n"
    "mls v22.4s, v10.4s, v14.4s\n"
    ".inst 0x4e9a9571  // sdot v17.4s, v11.16b, v26.16b\n"
    "sqadd v30.4s, v30.4s, v18.4s\n"
    "mls v19.4s, v17.4s, v14.4s\n"
    "srshl v30.4s, v30.4s, v21.4s\n"
    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
    "add v30.4s, v30.4s, v13.4s\n"
    "and v16.16b, v20.16b, v21.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "smax v30.4s, v30.4s, v9.4s\n"
    "and v17.16b, v22.16b, v21.16b\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "smin v30.4s, v30.4s, v12.4s\n"
    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
    "sqadd v20.4s, v20.4s, v16.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "and v16.16b, v19.16b, v21.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqadd v22.4s, v22.4s, v17.4s\n"
    "srshl v20.4s, v20.4s, v21.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "srshl v22.4s, v22.4s, v21.4s\n"
    "add v20.4s, v20.4s, v13.4s\n"
    "sqadd v19.4s, v19.4s, v16.4s\n"
    "smax v20.4s, v20.4s, v9.4s\n"
    "add v22.4s, v22.4s, v13.4s\n"
    "srshl v19.4s, v19.4s, v21.4s\n"
    "smin v20.4s, v20.4s, v12.4s\n"
    "smax v22.4s, v22.4s, v9.4s\n"
    "add v19.4s, v19.4s, v13.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "smin v22.4s, v22.4s, v12.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "smax v19.4s, v19.4s, v9.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "smin v19.4s, v19.4s, v12.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "blt 27f\n"
    "str s30, [x25, x23]\n"
    "str s22, [x24, x23]\n"
    "str s20, [x22, x23]\n"
    "str s19, [x21, x23]\n"
    "b 30f\n"
    "27:"  // Oddments: Unroll 2: Oddment store
    "add x25, x25, x23\n"
    "add x24, x24, x23\n"
    "add x22, x22, x23\n"
    "add x21, x21, x23\n"
    "tbz x19, #1, 28f\n"
    "st1 { v30.h }[0], [x25], #0x2\n"
    "st1 { v22.h }[0], [x24], #0x2\n"
    "st1 { v20.h }[0], [x22], #0x2\n"
    "st1 { v19.h }[0], [x21], #0x2\n"
    "tbz x19, #0, 29f\n"
    "st1 { v30.b }[2], [x25], #0x1\n"
    "st1 { v22.b }[2], [x24], #0x1\n"
    "st1 { v20.b }[2], [x22], #0x1\n"
    "st1 { v19.b }[2], [x21], #0x1\n"
    "b 29f\n"
    "28:"  // Oddments: Unroll 2: Oddment store: Bit 1: Unset
    "tbz x19, #0, 29f\n"
    "st1 { v30.b }[0], [x25], #0x1\n"
    "st1 { v22.b }[0], [x24], #0x1\n"
    "st1 { v20.b }[0], [x22], #0x1\n"
    "st1 { v19.b }[0], [x21], #0x1\n"
    "29:"  // Oddments: Unroll 2: Oddment store: Bit 1: End

    "30:"  // Oddments: Unroll 2: After oddment store
    "add x23, x23, #0x4\n"
    "subs x19, x19, #0x4\n"
    "ble 34f\n"
    "movi v15.4s, #0x0\n"
    "ldr q8, [SP, #0x10]\n"
    "movi v10.4s, #0x0\n"
    "ldr q2, [SP, #0x30]\n"
    "ldr q28, [SP, #0x50]\n"
    ".inst 0x4e82956f  // sdot v15.4s, v11.16b, v2.16b\n"
    "ldr q24, [SP, #0x70]\n"
    "ldr q30, [%x[params], #0x0]\n"
    "mov v22.16b, v30.16b\n"
    "ldr q29, [%x[params], #0x10]\n"
    "mov v20.16b, v30.16b\n"
    "ldr q27, [%x[params], #0x20]\n"
    "mov v19.16b, v30.16b\n"
    "ldr q25, [%x[params], #0x30]\n"
    ".inst 0x4e9c956f  // sdot v15.4s, v11.16b, v28.16b\n"
    "ldr q23, [%x[params], #0x40]\n"
    "ldr q21, [%x[params], #0x50]\n"
    ".inst 0x4e8897be  // sdot v30.4s, v29.16b, v8.16b\n"
    "add %x[params], %x[params], #0x60\n"
    ".inst 0x4e8297b4  // sdot v20.4s, v29.16b, v2.16b\n"
    "mov v17.16b, v15.16b\n"
    ".inst 0x4e88956f  // sdot v15.4s, v11.16b, v8.16b\n"
    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
    ".inst 0x4e82977e  // sdot v30.4s, v27.16b, v2.16b\n"
    ".inst 0x4e9c9774  // sdot v20.4s, v27.16b, v28.16b\n"
    "ext v8.16b, v8.16b, v8.16b, #0x1\n"
    "ext v2.16b, v2.16b, v2.16b, #0x1\n"
    ".inst 0x4e9c973e  // sdot v30.4s, v25.16b, v28.16b\n"
    "mls v30.4s, v15.4s, v14.4s\n"
    ".inst 0x4e989734  // sdot v20.4s, v25.16b, v24.16b\n"
    "ext v28.16b, v28.16b, v28.16b, #0x1\n"
    "mls v20.4s, v17.4s, v14.4s\n"
    "ext v24.16b, v24.16b, v24.16b, #0x1\n"
    ".inst 0x4e8897b6  // sdot v22.4s, v29.16b, v8.16b\n"
    ".inst 0x4e8297b3  // sdot v19.4s, v29.16b, v2.16b\n"
    ".inst 0x4e82956a  // sdot v10.4s, v11.16b, v2.16b\n"
    "sqrdmulh v30.4s, v30.4s, v23.4s\n"
    ".inst 0x4e829776  // sdot v22.4s, v27.16b, v2.16b\n"
    ".inst 0x4e9c9773  // sdot v19.4s, v27.16b, v28.16b\n"
    ".inst 0x4e9c956a  // sdot v10.4s, v11.16b, v28.16b\n"
    "and v18.16b, v30.16b, v21.16b\n"
    "sshr v18.4s, v18.4s, #0x1f\n"
    ".inst 0x4e9c9736  // sdot v22.4s, v25.16b, v28.16b\n"
    ".inst 0x4e989733  // sdot v19.4s, v25.16b, v24.16b\n"
    "mov v17.16b, v10.16b\n"
    ".inst 0x4e88956a  // sdot v10.4s, v11.16b, v8.16b\n"
    "mls v22.4s, v10.4s, v14.4s\n"
    ".inst 0x4e989571  // sdot v17.4s, v11.16b, v24.16b\n"
    "sqadd v30.4s, v30.4s, v18.4s\n"
    "mls v19.4s, v17.4s, v14.4s\n"
    "srshl v30.4s, v30.4s, v21.4s\n"
    "sqrdmulh v20.4s, v20.4s, v23.4s\n"
    "sqrdmulh v22.4s, v22.4s, v23.4s\n"
    "add v30.4s, v30.4s, v13.4s\n"
    "and v16.16b, v20.16b, v21.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "smax v30.4s, v30.4s, v9.4s\n"
    "and v17.16b, v22.16b, v21.16b\n"
    "sshr v17.4s, v17.4s, #0x1f\n"
    "smin v30.4s, v30.4s, v12.4s\n"
    "sqrdmulh v19.4s, v19.4s, v23.4s\n"
    "sqadd v20.4s, v20.4s, v16.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "and v16.16b, v19.16b, v21.16b\n"
    "sshr v16.4s, v16.4s, #0x1f\n"
    "sqadd v22.4s, v22.4s, v17.4s\n"
    "srshl v20.4s, v20.4s, v21.4s\n"
    "uzp1 v30.16b, v30.16b, v30.16b\n"
    "srshl v22.4s, v22.4s, v21.4s\n"
    "add v20.4s, v20.4s, v13.4s\n"
    "sqadd v19.4s, v19.4s, v16.4s\n"
    "smax v20.4s, v20.4s, v9.4s\n"
    "add v22.4s, v22.4s, v13.4s\n"
    "srshl v19.4s, v19.4s, v21.4s\n"
    "smin v20.4s, v20.4s, v12.4s\n"
    "smax v22.4s, v22.4s, v9.4s\n"
    "add v19.4s, v19.4s, v13.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "smin v22.4s, v22.4s, v12.4s\n"
    "uzp1 v20.16b, v20.16b, v20.16b\n"
    "smax v19.4s, v19.4s, v9.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "smin v19.4s, v19.4s, v12.4s\n"
    "uzp1 v22.16b, v22.16b, v22.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "uzp1 v19.16b, v19.16b, v19.16b\n"
    "31:"  // Oddments: Unroll 3: Oddment store
    "add x25, x25, x23\n"
    "add x24, x24, x23\n"
    "add x22, x22, x23\n"
    "add x21, x21, x23\n"
    "tbz x19, #1, 32f\n"
    "st1 { v30.h }[0], [x25], #0x2\n"
    "st1 { v22.h }[0], [x24], #0x2\n"
    "st1 { v20.h }[0], [x22], #0x2\n"
    "st1 { v19.h }[0], [x21], #0x2\n"
    "tbz x19, #0, 33f\n"
    "st1 { v30.b }[2], [x25], #0x1\n"
    "st1 { v22.b }[2], [x24], #0x1\n"
    "st1 { v20.b }[2], [x22], #0x1\n"
    "st1 { v19.b }[2], [x21], #0x1\n"
    "b 33f\n"
    "32:"  // Oddments: Unroll 3: Oddment store: Bit 1: Unset
    "tbz x19, #0, 33f\n"
    "st1 { v30.b }[0], [x25], #0x1\n"
    "st1 { v22.b }[0], [x24], #0x1\n"
    "st1 { v20.b }[0], [x22], #0x1\n"
    "st1 { v19.b }[0], [x21], #0x1\n"
    "33:"  // Oddments: Unroll 3: Oddment store: Bit 1: End

    "34:"  // End
    "add SP, SP, #0x80\n"
    : [params] "+&r" (params)
    : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
  );
}

}  // namespace depthwise
}  // namespace arm_conv

#endif  // defined(__aarch64__)
